library(ggplot2)
library(readr)
library(dplyr)
library(tidyr)
library(forcats)
library(DT)
library(tibble)
library(cowplot)

theme_set(theme_bw() + 
            theme(legend.title = element_blank(),
                  panel.grid.minor = element_blank()))

Read in files

and convert to wide format

#4CE long format Labs Data
patient_obs <- read_csv('penn-data/labs_long_thrombo_v2.csv') %>% 
  mutate(severity = (severe_ind == 1) %>% 
           as_factor() %>% 
           fct_recode('nonsevere' = 'FALSE', 'severe' = 'TRUE'))

#Code, Descriptions and Ranges
lab_mapping <- read_csv('public-data/loinc-map.csv')
load('public-data/lab.range.rda')
# load('public-data/code.dict.rda')
lab_bounds <- lab.range[, -1] %>% 
  colMeans() %>% 
  data.frame(value = .) %>% 
  rownames_to_column() %>% 
  separate(col = rowname, into = c('LOINC', 'bound'), sep = '_') %>% 
  mutate(LOINC = gsub('loinc', '', LOINC) %>% gsub('\\.', '-', .),
         value = exp(value)) %>% 
  pivot_wider(names_from = 'bound', values_from = 'value') %>%
  left_join(lab_mapping, by = 'LOINC') %>%
  {.}
patient_obs_wide <- patient_obs %>% 
  left_join(lab_bounds, by = c('concept_code' = 'LOINC')) %>% 
  select(- concept_code) %>% 
  pivot_wider(id_cols = c(patient_num, days_since_admission, severity),
              names_from = short_name,
              values_from = value,
              values_fn = mean)

#check NAs in the Wide format
na_stats <- patient_obs_wide %>% 
  select(- c(patient_num, days_since_admission, severity)) %>% 
  is.na() %>% 
  `!` 

na_df <- data.frame(value_existed = colSums(na_stats), 
           prop_existed = colMeans(na_stats)) %>% 
  rownames_to_column('lab') %>% 
  mutate(prop_na = 1 - prop_existed,
         lab = fct_reorder(lab, value_existed))

n_values <- na_df %>% 
  ggplot(aes(x = value_existed, y = lab)) +
  geom_col() + 
  labs(x = 'Number of values', y = NULL)

na_prob <- na_df %>%
  rename('Valid value' = prop_existed, 'NA' = prop_na) %>% 
  pivot_longer(c(`Valid value`, `NA`)) %>% 
  ggplot(aes(x = value, y = lab, fill = name)) +
  geom_col() + 
  scale_fill_discrete(guide = guide_legend(reverse = TRUE)) +
  labs(x = 'Proportion', y = NULL) +
  # guides(fill = guide_legend(reverse = TRUE))
  theme(
    axis.text.y = element_blank(),
    legend.key.width = unit(6, 'mm'),
    legend.key.height = unit(4, 'mm'),
    legend.position = 'bottom')

plot_grid(n_values, na_prob, nrow = 1, axis = 'b', align = 'h')

penn_na_df <- na_df

Number of observation (days) per patient

days_count_min_max <- patient_obs_wide %>%
  group_by(patient_num, severity) %>%
  summarise(
    n_values = n_distinct(days_since_admission),
    min_day = min(days_since_admission),
    max_day = max(days_since_admission),
    .groups = 'drop'
  ) %>% 
  mutate(time_obs = max_day - min_day) %>% 
  select(-patient_num) %>% 
  add_count(severity, name = 'n_severity') 

agg_n_values <- days_count_min_max %>% 
  count(severity, n_severity, n_values,
        name = 'n_nvals')
agg_max_day <- days_count_min_max %>% 
  count(severity, n_severity, max_day,
        name = 'n_maxday')

(n_severe <- sum(days_count_min_max$severity == 'severe'))
## [1] 817
(n_nonsevere <- sum(days_count_min_max$severity == 'nonsevere'))
## [1] 1876
# summary(days_count_min_max$n_values)

Histogram of the number of days with at least one observation

agg_n_values %>% 
  ggplot(aes(x = n_values, y = n_nvals, fill = severity)) +
  geom_col(alpha = 0.5) +
  scale_fill_brewer(palette = 'Dark2', direction = -1) +
  labs(fill = 'Severe?', 
       x = "Number of days with data", 
       y = "Count")

Histogram of length of stay

i.e. last day with observation-first day with observation

We need to check for readmission here.

agg_max_day %>% 
  ggplot(aes(x = max_day, y = n_maxday, fill = severity)) +
  geom_col(alpha = 0.5) +
  scale_fill_brewer(palette = 'Dark2', direction = -1) +
  labs(fill = 'Severe?', 
       x = "Number of days with data", 
       y = "Count")

Analyze missingness and frequency of measures for each lab

patient_obs_long <- patient_obs_wide %>% 
  pivot_longer(-c(patient_num, days_since_admission, severity),
               names_to = 'lab', values_to = 'value',
               values_drop_na = TRUE)
  
per_lab <- patient_obs_long %>% 
  group_by(lab, patient_num, severity) %>% 
  count(name = 'n_obs') %>% 
  ungroup() %>% 
  group_by(lab, severity) %>% 
  count(n_obs) %>% 
  ungroup() %>% 
  pivot_wider(names_from = severity, values_from = n, values_fill = 0) %>% 
  mutate(both_severities = nonsevere + severe) %>% 
  mutate(prop_nonsevere = nonsevere/n_nonsevere,
         prop_severe = severe/n_severe,
         prop_both = both_severities/nrow(days_count_min_max))

lab_medians <-
  patient_obs_long %>% 
  add_count(lab, name = 'total_obs') %>% 
  group_by(lab) %>% 
  mutate(total_patients = length(unique(patient_num))) %>% 
  add_count(patient_num, name = 'n_obs_patients') %>% 
  mutate(median_obs_per_patient = median(n_obs_patients),
         n_greater0 = n_obs_patients > median_obs_per_patient,
         n_greater1 = n_obs_patients > median_obs_per_patient + 1,
         n_greater2 = n_obs_patients > median_obs_per_patient + 2) %>% 
  group_by(severity) %>% 
  mutate(each_med_obs_per_patient = median(n_obs_patients)) %>% 
  ungroup(severity) %>% 
  select(- c(days_since_admission, value)) %>% 
  distinct() %>% 
  group_by(lab) %>% 
  mutate(across(contains('n_greater'), sum)) %>% 
  select(- c(n_obs_patients, patient_num)) %>% 
  distinct() %>% 
  pivot_wider(names_from = severity, values_from = each_med_obs_per_patient) %>% 
  rename('median_obs_per_severe_patient' = severe,
         'median_obs_per_non_severe_patient' = nonsevere) %>% 
  select(lab, total_obs, total_patients, starts_with('med'), starts_with('n_'))

lab_medians %>% 
  datatable(rownames = FALSE)

n_greater0 shows the number of patients who had more observations than the median. n_greater1 shows the number of patients who had more observations than the median + 1. n_greater2 shows the number of patients who had more observations than the median + 2.

In the figure below:

  • Grey dash line: Reference Low
  • Grey solid line: Reference High
  • Black dash line: lower bound outlier (QC)
  • Black solid line: upper bound outlier (QC)
patient_obs_long %>% 
  left_join(lab_bounds, by = c('lab' = 'short_name')) %>% 
  ggplot(aes(y = severity, x = value, fill = severity)) +
  geom_violin() +
  scale_fill_brewer(palette = 'Dark2', guide = guide_legend(reverse = TRUE)) +
  labs(y = NULL, x = NULL) +
  geom_vline(aes(xintercept = `Reference Low`), linetype = 'dashed', color = 'grey') +
  geom_vline(aes(xintercept = `Reference High`), color = 'grey') +
  geom_vline(aes(xintercept = LB), linetype = 'dashed') +
  geom_vline(aes(xintercept = UB)) +
  facet_wrap(~ lab, scales = 'free', ncol = 2, strip.position = 'left') +
  theme(axis.text.y = element_blank())

Missing data heatmap

Capped at 20 days with observations.

per_lab %>% 
  select(lab, n_obs, both_severities, severe, nonsevere) %>% 
  filter(n_obs <= 20) %>% 
  pivot_longer(c(both_severities, severe, nonsevere)) %>% 
  mutate(name = name %>% fct_recode(
    'All patients' = 'both_severities',
    'Severe patients' = 'severe',
    'Non-severe patients' = 'nonsevere'
  )) %>% 
  ggplot(aes(x = n_obs, fill = value, y = fct_reorder(lab, n_obs))) + 
  geom_tile(colour = "white", size = 0.2)+
  geom_text(aes(label = value), colour = "white", size = 2) +
  scale_y_discrete(expand = c(0, 0))+
  scale_x_continuous(expand = c(0, 0),
                     breaks = c(1:max(per_lab$n_obs)),
                     labels = c(1:max(per_lab$n_obs)))+
  scale_fill_gradient(low = "lightgrey", high = "darkblue",
                      limits = c(0, max(per_lab$both_severities))) +
  facet_wrap(~ name, nrow = 1) +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '# patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.93, 0.2),
        axis.ticks.y = element_blank()
  )

“Binned” heatmap: every 15 days

per_lab %>%
  mutate(obs_bin = cut(n_obs, breaks = 15, include.lowest = TRUE)) %>% 
  group_by(lab, obs_bin) %>% 
  summarise(both_severities = sum(both_severities), 
            severe = sum(severe), 
            nonsevere = sum(nonsevere), 
            .groups = 'drop') %>% 
  select(lab, obs_bin, both_severities, severe, nonsevere) %>% 
  pivot_longer(c(both_severities, severe, nonsevere)) %>% 
  mutate(name = name %>% fct_recode(
    'All patients' = 'both_severities',
    'Severe patients' = 'severe',
    'Non-severe patients' = 'nonsevere'
  )) %>% 
  ggplot(aes(x = obs_bin, fill = value, y = fct_reorder(lab, value))) + 
  geom_tile(colour = "white", size = 0.2) +
  geom_text(aes(label = value), colour = "white", size = 2) +
  scale_y_discrete(expand = c(0, 0))+
  scale_fill_gradient(low = "lightgrey", high = "darkblue") +
  facet_wrap(~ name, nrow = 1) +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '# patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.93, 0.2),
        axis.text.x = element_text(angle = 90, hjust = 1),
        axis.ticks.y = element_blank()
  )

boxplot(per_lab$n_obs)

sum(per_lab$n_obs > 90)
## [1] 15
per_lab %>% 
  select(lab, n_obs, prop_both, prop_severe, prop_nonsevere) %>% 
  filter(n_obs <= 20) %>% 
  pivot_longer(c(prop_both, prop_severe, prop_nonsevere)) %>% 
  mutate(name = name %>% fct_recode(
    'Compared to all patients' = 'prop_both',
    'Compared to all severe patients' = 'prop_severe',
    'Compared to all non-severe patients' = 'prop_nonsevere'
  )) %>% 
  ggplot(aes(x = n_obs, fill = value, y = fct_reorder(lab, n_obs))) + 
  geom_tile(colour = "white", size = 0.2)+
  geom_text(aes(label = round(value, 2)), colour = "white", size = 2) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_continuous(expand = c(0, 0),
                     breaks = c(1:max(per_lab$n_obs)),
                     labels = c(1:max(per_lab$n_obs)))+ 
  scale_fill_gradient(low = "lightgrey", high = "darkblue",
                      labels = scales::percent_format(accuracy = 1L)) +
  facet_wrap(~ name, nrow = 1) +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '% patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.93, 0.2),
        axis.ticks.y = element_blank()
  )

per_lab %>%
  mutate(obs_bin = cut(n_obs, breaks = 15, include.lowest = TRUE)) %>% 
  group_by(lab, obs_bin) %>% 
  summarise(prop_both = sum(prop_both), 
            prop_severe = sum(prop_severe), 
            prop_nonsevere = sum(prop_nonsevere), 
            .groups = 'drop') %>% 
  select(lab, obs_bin, prop_both, prop_severe, prop_nonsevere) %>% 
  pivot_longer(c(prop_both, prop_severe, prop_nonsevere)) %>% 
  mutate(name = name %>% fct_recode(
    'Compared to all patients' = 'prop_both',
    'Compared to all severe patients' = 'prop_severe',
    'Compared to all non-severe patients' = 'prop_nonsevere'
  )) %>% 
  ggplot(aes(x = obs_bin, fill = value, y = fct_reorder(lab, value))) + 
  geom_tile(colour = "white", size = 0.2) +
  geom_text(aes(label = round(value, 2)), colour = "white", size = 2) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_fill_gradient(low = "lightgrey", high = "darkblue",
                      labels = scales::percent_format(accuracy = 1L)) +
  facet_wrap(~ name, nrow = 1) +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '% patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.93, 0.2),
        axis.text.x = element_text(angle = 90, hjust = 1),
        axis.ticks.y = element_blank()
  )

Denominator: total number of patients, total number of non-severe patients, and total number of severe patients, respectively.

per_lab %>% 
  select(lab, n_obs, severe, nonsevere) %>% 
  filter(n_obs <= 90) %>%
  pivot_longer(c(severe, nonsevere)) %>% 
  mutate(lab = fct_reorder(lab, n_obs),
         name = name %>% fct_recode(
    'Severe patients' = 'severe',
    'Non-severe patients' = 'nonsevere'
  )) %>% 
  ggplot(aes(x = n_obs, fill = name, y = value)) + 
  geom_col() +
  scale_fill_brewer(palette = 'Dark2', direction = -1) +
  facet_wrap(~ lab, scales = 'free') +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '# patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.9, 0.2),
        axis.ticks.y = element_blank()
  )

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] cowplot_1.1.0 tibble_3.0.4  DT_0.16       forcats_0.5.0 tidyr_1.1.2  
## [6] dplyr_1.0.2   readr_1.4.0   ggplot2_3.3.2
## 
## loaded via a namespace (and not attached):
##  [1] RColorBrewer_1.1-2 pillar_1.4.6       compiler_4.0.3     tools_4.0.3       
##  [5] digest_0.6.27      jsonlite_1.7.1     evaluate_0.14      lifecycle_0.2.0   
##  [9] gtable_0.3.0       pkgconfig_2.0.3    rlang_0.4.8        rstudioapi_0.13   
## [13] cli_2.1.0          crosstalk_1.1.0.1  yaml_2.2.1         xfun_0.19         
## [17] withr_2.3.0        stringr_1.4.0      knitr_1.30         generics_0.1.0    
## [21] vctrs_0.3.4        htmlwidgets_1.5.2  hms_0.5.3          grid_4.0.3        
## [25] tidyselect_1.1.0   glue_1.4.2         R6_2.5.0           fansi_0.4.1       
## [29] rmarkdown_2.5      farver_2.0.3       purrr_0.3.4        magrittr_1.5      
## [33] scales_1.1.1       ellipsis_0.3.1     htmltools_0.5.0    assertthat_0.2.1  
## [37] colorspace_2.0-0   labeling_0.4.2     stringi_1.5.3      munsell_0.5.0     
## [41] crayon_1.3.4
---
title: "Missingness analysis"
author: Amelia Tan, Arianna Dagliati, Trang Le
date: "27 October 2020"
---


```{r setup, warning=FALSE, message=FALSE}
library(ggplot2)
library(readr)
library(dplyr)
library(tidyr)
library(forcats)
library(DT)
library(tibble)
library(cowplot)

theme_set(theme_bw() + 
            theme(legend.title = element_blank(),
                  panel.grid.minor = element_blank()))
```

## Read in files 
and convert to wide format
```{R message=FALSE}
#4CE long format Labs Data
patient_obs <- read_csv('penn-data/labs_long_thrombo_v2.csv') %>% 
  mutate(severity = (severe_ind == 1) %>% 
           as_factor() %>% 
           fct_recode('nonsevere' = 'FALSE', 'severe' = 'TRUE'))

#Code, Descriptions and Ranges
lab_mapping <- read_csv('public-data/loinc-map.csv')
load('public-data/lab.range.rda')
# load('public-data/code.dict.rda')
```

```{r}
lab_bounds <- lab.range[, -1] %>% 
  colMeans() %>% 
  data.frame(value = .) %>% 
  rownames_to_column() %>% 
  separate(col = rowname, into = c('LOINC', 'bound'), sep = '_') %>% 
  mutate(LOINC = gsub('loinc', '', LOINC) %>% gsub('\\.', '-', .),
         value = exp(value)) %>% 
  pivot_wider(names_from = 'bound', values_from = 'value') %>%
  left_join(lab_mapping, by = 'LOINC') %>%
  {.}
```

```{R}
patient_obs_wide <- patient_obs %>% 
  left_join(lab_bounds, by = c('concept_code' = 'LOINC')) %>% 
  select(- concept_code) %>% 
  pivot_wider(id_cols = c(patient_num, days_since_admission, severity),
              names_from = short_name,
              values_from = value,
              values_fn = mean)

#check NAs in the Wide format
na_stats <- patient_obs_wide %>% 
  select(- c(patient_num, days_since_admission, severity)) %>% 
  is.na() %>% 
  `!` 

na_df <- data.frame(value_existed = colSums(na_stats), 
           prop_existed = colMeans(na_stats)) %>% 
  rownames_to_column('lab') %>% 
  mutate(prop_na = 1 - prop_existed,
         lab = fct_reorder(lab, value_existed))

n_values <- na_df %>% 
  ggplot(aes(x = value_existed, y = lab)) +
  geom_col() + 
  labs(x = 'Number of values', y = NULL)

na_prob <- na_df %>%
  rename('Valid value' = prop_existed, 'NA' = prop_na) %>% 
  pivot_longer(c(`Valid value`, `NA`)) %>% 
  ggplot(aes(x = value, y = lab, fill = name)) +
  geom_col() + 
  scale_fill_discrete(guide = guide_legend(reverse = TRUE)) +
  labs(x = 'Proportion', y = NULL) +
  # guides(fill = guide_legend(reverse = TRUE))
  theme(
    axis.text.y = element_blank(),
    legend.key.width = unit(6, 'mm'),
    legend.key.height = unit(4, 'mm'),
    legend.position = 'bottom')

plot_grid(n_values, na_prob, nrow = 1, axis = 'b', align = 'h')

penn_na_df <- na_df
```

## Number of observation (days) per patient
```{R}
days_count_min_max <- patient_obs_wide %>%
  group_by(patient_num, severity) %>%
  summarise(
    n_values = n_distinct(days_since_admission),
    min_day = min(days_since_admission),
    max_day = max(days_since_admission),
    .groups = 'drop'
  ) %>% 
  mutate(time_obs = max_day - min_day) %>% 
  select(-patient_num) %>% 
  add_count(severity, name = 'n_severity') 

agg_n_values <- days_count_min_max %>% 
  count(severity, n_severity, n_values,
        name = 'n_nvals')
agg_max_day <- days_count_min_max %>% 
  count(severity, n_severity, max_day,
        name = 'n_maxday')

(n_severe <- sum(days_count_min_max$severity == 'severe'))
(n_nonsevere <- sum(days_count_min_max$severity == 'nonsevere'))
# summary(days_count_min_max$n_values)
```

## Histogram of the number of days with at least one observation
```{R}
agg_n_values %>% 
  ggplot(aes(x = n_values, y = n_nvals, fill = severity)) +
  geom_col(alpha = 0.5) +
  scale_fill_brewer(palette = 'Dark2', direction = -1) +
  labs(fill = 'Severe?', 
       x = "Number of days with data", 
       y = "Count")
```

## Histogram of length of stay
i.e. last day with observation-first day with observation

We need to check for readmission here.
```{R}
agg_max_day %>% 
  ggplot(aes(x = max_day, y = n_maxday, fill = severity)) +
  geom_col(alpha = 0.5) +
  scale_fill_brewer(palette = 'Dark2', direction = -1) +
  labs(fill = 'Severe?', 
       x = "Number of days with data", 
       y = "Count")
```

## Analyze missingness and frequency of measures for each lab

```{r}
patient_obs_long <- patient_obs_wide %>% 
  pivot_longer(-c(patient_num, days_since_admission, severity),
               names_to = 'lab', values_to = 'value',
               values_drop_na = TRUE)
  
per_lab <- patient_obs_long %>% 
  group_by(lab, patient_num, severity) %>% 
  count(name = 'n_obs') %>% 
  ungroup() %>% 
  group_by(lab, severity) %>% 
  count(n_obs) %>% 
  ungroup() %>% 
  pivot_wider(names_from = severity, values_from = n, values_fill = 0) %>% 
  mutate(both_severities = nonsevere + severe) %>% 
  mutate(prop_nonsevere = nonsevere/n_nonsevere,
         prop_severe = severe/n_severe,
         prop_both = both_severities/nrow(days_count_min_max))

lab_medians <-
  patient_obs_long %>% 
  add_count(lab, name = 'total_obs') %>% 
  group_by(lab) %>% 
  mutate(total_patients = length(unique(patient_num))) %>% 
  add_count(patient_num, name = 'n_obs_patients') %>% 
  mutate(median_obs_per_patient = median(n_obs_patients),
         n_greater0 = n_obs_patients > median_obs_per_patient,
         n_greater1 = n_obs_patients > median_obs_per_patient + 1,
         n_greater2 = n_obs_patients > median_obs_per_patient + 2) %>% 
  group_by(severity) %>% 
  mutate(each_med_obs_per_patient = median(n_obs_patients)) %>% 
  ungroup(severity) %>% 
  select(- c(days_since_admission, value)) %>% 
  distinct() %>% 
  group_by(lab) %>% 
  mutate(across(contains('n_greater'), sum)) %>% 
  select(- c(n_obs_patients, patient_num)) %>% 
  distinct() %>% 
  pivot_wider(names_from = severity, values_from = each_med_obs_per_patient) %>% 
  rename('median_obs_per_severe_patient' = severe,
         'median_obs_per_non_severe_patient' = nonsevere) %>% 
  select(lab, total_obs, total_patients, starts_with('med'), starts_with('n_'))

lab_medians %>% 
  datatable(rownames = FALSE)
```


`n_greater0` shows the number of patients who had more observations than the median.
`n_greater1` shows the number of patients who had more observations than the median + 1.
`n_greater2` shows the number of patients who had more observations than the median + 2.

```{r eval=FALSE, include=FALSE}
lab_medians %>% 
  select(lab, total_obs, total_patients) %>%
  pivot_longer(- lab) %>% 
  ggplot(aes(x = value, y = fct_reorder(lab, value))) +
  geom_col() +
  facet_grid(cols = vars(name), scales = 'free_x', space = 'free') +
  labs(y = NULL)
```

In the figure below:

- Grey dash line: `Reference Low`
- Grey solid line: `Reference High`
- Black dash line: `lower bound outlier` (QC)
- Black solid line: `upper bound outlier` (QC)

```{r fig.width=9, fig.height=11}
patient_obs_long %>% 
  left_join(lab_bounds, by = c('lab' = 'short_name')) %>% 
  ggplot(aes(y = severity, x = value, fill = severity)) +
  geom_violin() +
  scale_fill_brewer(palette = 'Dark2', guide = guide_legend(reverse = TRUE)) +
  labs(y = NULL, x = NULL) +
  geom_vline(aes(xintercept = `Reference Low`), linetype = 'dashed', color = 'grey') +
  geom_vline(aes(xintercept = `Reference High`), color = 'grey') +
  geom_vline(aes(xintercept = LB), linetype = 'dashed') +
  geom_vline(aes(xintercept = UB)) +
  facet_wrap(~ lab, scales = 'free', ncol = 2, strip.position = 'left') +
  theme(axis.text.y = element_blank())
```

## Missing data heatmap

Capped at 20 days with observations.

```{r fig.width=12}
per_lab %>% 
  select(lab, n_obs, both_severities, severe, nonsevere) %>% 
  filter(n_obs <= 20) %>% 
  pivot_longer(c(both_severities, severe, nonsevere)) %>% 
  mutate(name = name %>% fct_recode(
    'All patients' = 'both_severities',
    'Severe patients' = 'severe',
    'Non-severe patients' = 'nonsevere'
  )) %>% 
  ggplot(aes(x = n_obs, fill = value, y = fct_reorder(lab, n_obs))) + 
  geom_tile(colour = "white", size = 0.2)+
  geom_text(aes(label = value), colour = "white", size = 2) +
  scale_y_discrete(expand = c(0, 0))+
  scale_x_continuous(expand = c(0, 0),
                     breaks = c(1:max(per_lab$n_obs)),
                     labels = c(1:max(per_lab$n_obs)))+
  scale_fill_gradient(low = "lightgrey", high = "darkblue",
                      limits = c(0, max(per_lab$both_severities))) +
  facet_wrap(~ name, nrow = 1) +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '# patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.93, 0.2),
        axis.ticks.y = element_blank()
  )
```

"Binned" heatmap: every 15 days

```{r fig.width=12}
per_lab %>%
  mutate(obs_bin = cut(n_obs, breaks = 15, include.lowest = TRUE)) %>% 
  group_by(lab, obs_bin) %>% 
  summarise(both_severities = sum(both_severities), 
            severe = sum(severe), 
            nonsevere = sum(nonsevere), 
            .groups = 'drop') %>% 
  select(lab, obs_bin, both_severities, severe, nonsevere) %>% 
  pivot_longer(c(both_severities, severe, nonsevere)) %>% 
  mutate(name = name %>% fct_recode(
    'All patients' = 'both_severities',
    'Severe patients' = 'severe',
    'Non-severe patients' = 'nonsevere'
  )) %>% 
  ggplot(aes(x = obs_bin, fill = value, y = fct_reorder(lab, value))) + 
  geom_tile(colour = "white", size = 0.2) +
  geom_text(aes(label = value), colour = "white", size = 2) +
  scale_y_discrete(expand = c(0, 0))+
  scale_fill_gradient(low = "lightgrey", high = "darkblue") +
  facet_wrap(~ name, nrow = 1) +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '# patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.93, 0.2),
        axis.text.x = element_text(angle = 90, hjust = 1),
        axis.ticks.y = element_blank()
  )
```

```{r}
boxplot(per_lab$n_obs)
sum(per_lab$n_obs > 90)
```

```{r fig.width=12}
per_lab %>% 
  select(lab, n_obs, prop_both, prop_severe, prop_nonsevere) %>% 
  filter(n_obs <= 20) %>% 
  pivot_longer(c(prop_both, prop_severe, prop_nonsevere)) %>% 
  mutate(name = name %>% fct_recode(
    'Compared to all patients' = 'prop_both',
    'Compared to all severe patients' = 'prop_severe',
    'Compared to all non-severe patients' = 'prop_nonsevere'
  )) %>% 
  ggplot(aes(x = n_obs, fill = value, y = fct_reorder(lab, n_obs))) + 
  geom_tile(colour = "white", size = 0.2)+
  geom_text(aes(label = round(value, 2)), colour = "white", size = 2) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_continuous(expand = c(0, 0),
                     breaks = c(1:max(per_lab$n_obs)),
                     labels = c(1:max(per_lab$n_obs)))+ 
  scale_fill_gradient(low = "lightgrey", high = "darkblue",
                      labels = scales::percent_format(accuracy = 1L)) +
  facet_wrap(~ name, nrow = 1) +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '% patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.93, 0.2),
        axis.ticks.y = element_blank()
  )
```

```{r fig.width=12}
per_lab %>%
  mutate(obs_bin = cut(n_obs, breaks = 15, include.lowest = TRUE)) %>% 
  group_by(lab, obs_bin) %>% 
  summarise(prop_both = sum(prop_both), 
            prop_severe = sum(prop_severe), 
            prop_nonsevere = sum(prop_nonsevere), 
            .groups = 'drop') %>% 
  select(lab, obs_bin, prop_both, prop_severe, prop_nonsevere) %>% 
  pivot_longer(c(prop_both, prop_severe, prop_nonsevere)) %>% 
  mutate(name = name %>% fct_recode(
    'Compared to all patients' = 'prop_both',
    'Compared to all severe patients' = 'prop_severe',
    'Compared to all non-severe patients' = 'prop_nonsevere'
  )) %>% 
  ggplot(aes(x = obs_bin, fill = value, y = fct_reorder(lab, value))) + 
  geom_tile(colour = "white", size = 0.2) +
  geom_text(aes(label = round(value, 2)), colour = "white", size = 2) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_fill_gradient(low = "lightgrey", high = "darkblue",
                      labels = scales::percent_format(accuracy = 1L)) +
  facet_wrap(~ name, nrow = 1) +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '% patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.93, 0.2),
        axis.text.x = element_text(angle = 90, hjust = 1),
        axis.ticks.y = element_blank()
  )
```

Denominator: total number of patients, total number of non-severe patients,
and total number of severe patients, respectively.

```{r}
per_lab %>% 
  select(lab, n_obs, severe, nonsevere) %>% 
  filter(n_obs <= 90) %>%
  pivot_longer(c(severe, nonsevere)) %>% 
  mutate(lab = fct_reorder(lab, n_obs),
         name = name %>% fct_recode(
    'Severe patients' = 'severe',
    'Non-severe patients' = 'nonsevere'
  )) %>% 
  ggplot(aes(x = n_obs, fill = name, y = value)) + 
  geom_col() +
  scale_fill_brewer(palette = 'Dark2', direction = -1) +
  facet_wrap(~ lab, scales = 'free') +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '# patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.9, 0.2),
        axis.ticks.y = element_blank()
  )
```


```{r}
sessionInfo()
```
